import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Load the employee dataset.
# Fixed: the original reused the name `data` for both the CSV path string and
# the DataFrame, shadowing the path — use a separate name for the path.
csv_path = r"C:\Users\laxma\Downloads\Employee.csv"
data = pd.read_csv(csv_path)
# (rows, columns) — quick sanity check of the dataset size
data.shape
(4653, 9)
# Preview the first five rows to eyeball the column contents
data.head()
| Education | JoiningYear | City | PaymentTier | Age | Gender | EverBenched | ExperienceInCurrentDomain | LeaveOrNot | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Bachelors | 2017 | Bangalore | 3 | 34 | Male | No | 0 | 0 |
| 1 | Bachelors | 2013 | Pune | 1 | 28 | Female | No | 3 | 1 |
| 2 | Bachelors | 2014 | New Delhi | 3 | 38 | Female | No | 2 | 0 |
| 3 | Masters | 2016 | Bangalore | 3 | 27 | Male | No | 5 | 1 |
| 4 | Masters | 2017 | Pune | 3 | 24 | Male | Yes | 2 | 1 |
# Column dtypes and non-null counts — the output shows no missing values
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4653 entries, 0 to 4652 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Education 4653 non-null object 1 JoiningYear 4653 non-null int64 2 City 4653 non-null object 3 PaymentTier 4653 non-null int64 4 Age 4653 non-null int64 5 Gender 4653 non-null object 6 EverBenched 4653 non-null object 7 ExperienceInCurrentDomain 4653 non-null int64 8 LeaveOrNot 4653 non-null int64 dtypes: int64(5), object(4) memory usage: 327.3+ KB
# Class distribution of the target column (LeaveOrNot: 0 / 1)
data['LeaveOrNot'].value_counts()
0 3053 1 1600 Name: LeaveOrNot, dtype: int64
# Count fully duplicated rows before cleaning
data.duplicated().sum()
1889
# Remove exact duplicate rows.
# Prefer reassignment over `inplace=True` — pandas discourages inplace
# mutation (it returns None and obscures the data flow); the end state of
# `data` is identical.
data = data.drop_duplicates()
# Verify no duplicates remain (expected: 0)
data.duplicated().sum()
0
# Summary statistics of the numeric columns (row count reflects the dedup above)
data.describe()
| JoiningYear | PaymentTier | Age | ExperienceInCurrentDomain | LeaveOrNot | |
|---|---|---|---|---|---|
| count | 2764.000000 | 2764.000000 | 2764.000000 | 2764.000000 | 2764.000000 |
| mean | 2015.090449 | 2.636035 | 30.952967 | 2.644356 | 0.393632 |
| std | 1.885943 | 0.624001 | 5.108872 | 1.610610 | 0.488643 |
| min | 2012.000000 | 1.000000 | 22.000000 | 0.000000 | 0.000000 |
| 25% | 2013.000000 | 2.000000 | 27.000000 | 1.000000 | 0.000000 |
| 50% | 2015.000000 | 3.000000 | 30.000000 | 2.000000 | 0.000000 |
| 75% | 2017.000000 | 3.000000 | 35.000000 | 4.000000 | 1.000000 |
| max | 2018.000000 | 3.000000 | 41.000000 | 7.000000 | 1.000000 |
# List all column names
data.columns
Index(['Education', 'JoiningYear', 'City', 'PaymentTier', 'Age', 'Gender',
'EverBenched', 'ExperienceInCurrentDomain', 'LeaveOrNot'],
dtype='object')
#VISUALIZATION
# Fixed: `plt.bar(data['JoiningYear'], data['City'])` is a broken chart —
# it uses the City *strings* as bar heights, which is meaningless.
# Plot yearly hiring counts broken down by city instead.
sns.countplot(x='JoiningYear', hue='City', data=data)
plt.xticks(rotation=90)
plt.show()
# Interactive violin plot of the age distribution per gender
gender_age_fig = px.violin(data_frame=data, x='Gender', y='Age', color='Gender')
gender_age_fig.show()
# Scatter of age against years of experience in the current domain
plt.scatter(x=data['ExperienceInCurrentDomain'], y=data['Age'], c='red')
plt.xticks(rotation=90)
plt.show()
# How many employees have ever been benched vs not
plt.figure(figsize=(10, 4))
sns.countplot(data=data, x='EverBenched', color='b')
plt.show()
# Horizontal count of the two target classes.
# Fixed: removed `top_car = data['LeaveOrNot'].value_counts().nlargest(10)` —
# it was computed but never used anywhere (dead code, apparently left over
# from a different notebook).
plt.figure(figsize=(10, 4))
sns.countplot(y=data.LeaveOrNot, color='red')
<AxesSubplot:xlabel='count', ylabel='LeaveOrNot'>
# Attrition level per city (seaborn aggregates LeaveOrNot per x-category,
# mean with a confidence band by default)
sns.lineplot(x='City', y='LeaveOrNot', data=data)
<AxesSubplot:xlabel='City', ylabel='LeaveOrNot'>
# Mean PaymentTier per education level.
# Fixed: seaborn removed positional x/y arguments in 0.12 (deprecation) and
# raises in newer releases — pass x, y and data by keyword.
sns.barplot(x='PaymentTier', y='Education', data=data, color='cyan')
plt.xticks(rotation=90)
plt.show()
# Hiring volume per joining year
sns.countplot(x='JoiningYear',data=data)
<AxesSubplot:xlabel='JoiningYear', ylabel='count'>
# Spread of current-domain experience, split by bench history
sns.boxplot(x='ExperienceInCurrentDomain',y='EverBenched',data=data)
<AxesSubplot:xlabel='ExperienceInCurrentDomain', ylabel='EverBenched'>
# Attrition (LeaveOrNot) distribution, split by bench history
sns.violinplot(x='EverBenched',y='LeaveOrNot',data=data)
<AxesSubplot:xlabel='EverBenched', ylabel='LeaveOrNot'>
#MODEL BUILDING
# Names of the object-dtype (categorical) columns, needed for encoding below.
# Idiom fix: use DataFrame.select_dtypes instead of comparing the dtype
# character 'O' by hand — same result, clearer intent.
categorical = data.select_dtypes(include='object').columns.tolist()
categorical
['Education', 'City', 'Gender', 'EverBenched']
# One-hot encode every categorical column in place; category_encoders names
# the generated columns <col>_1, <col>_2, ... in order of first appearance.
import category_encoders as ce

onehot = ce.OneHotEncoder(cols=['Education', 'City', 'Gender', 'EverBenched'])
data = onehot.fit_transform(data)
data.head()
| Education_1 | Education_2 | Education_3 | JoiningYear | City_1 | City_2 | City_3 | PaymentTier | Age | Gender_1 | Gender_2 | EverBenched_1 | EverBenched_2 | ExperienceInCurrentDomain | LeaveOrNot | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 2017 | 1 | 0 | 0 | 3 | 34 | 1 | 0 | 1 | 0 | 0 | 0 |
| 1 | 1 | 0 | 0 | 2013 | 0 | 1 | 0 | 1 | 28 | 0 | 1 | 1 | 0 | 3 | 1 |
| 2 | 1 | 0 | 0 | 2014 | 0 | 0 | 1 | 3 | 38 | 0 | 1 | 1 | 0 | 2 | 0 |
| 3 | 0 | 1 | 0 | 2016 | 1 | 0 | 0 | 3 | 27 | 1 | 0 | 1 | 0 | 5 | 1 |
| 4 | 0 | 1 | 0 | 2017 | 0 | 1 | 0 | 3 | 24 | 1 | 0 | 0 | 1 | 2 | 1 |
X = data.drop('LeaveOrNot', axis=1) #independent features (everything except the target)
y = data['LeaveOrNot'] #dependent / target variable
# Preview the feature matrix
X.head()
| Education_1 | Education_2 | Education_3 | JoiningYear | City_1 | City_2 | City_3 | PaymentTier | Age | Gender_1 | Gender_2 | EverBenched_1 | EverBenched_2 | ExperienceInCurrentDomain | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 2017 | 1 | 0 | 0 | 3 | 34 | 1 | 0 | 1 | 0 | 0 |
| 1 | 1 | 0 | 0 | 2013 | 0 | 1 | 0 | 1 | 28 | 0 | 1 | 1 | 0 | 3 |
| 2 | 1 | 0 | 0 | 2014 | 0 | 0 | 1 | 3 | 38 | 0 | 1 | 1 | 0 | 2 |
| 3 | 0 | 1 | 0 | 2016 | 1 | 0 | 0 | 3 | 27 | 1 | 0 | 1 | 0 | 5 |
| 4 | 0 | 1 | 0 | 2017 | 0 | 1 | 0 | 3 | 24 | 1 | 0 | 0 | 1 | 2 |
# Preview the target series
y.head()
0 0 1 1 2 0 3 1 4 1 Name: LeaveOrNot, dtype: int64
!pip install XGBoost
Collecting XGBoost Using cached xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB) Requirement already satisfied: numpy in d:\anaconda files\lib\site-packages (from XGBoost) (1.24.4) Requirement already satisfied: scipy in d:\anaconda files\lib\site-packages (from XGBoost) (1.9.1) Installing collected packages: XGBoost Successfully installed XGBoost-2.0.3
import xgboost as xgb
# Build XGBoost's optimized DMatrix from the full feature matrix.
# NOTE(review): data_dmatrix is never used below — the sklearn-style
# XGBClassifier builds its own internal representation. Remove it, or use it
# with xgb.cv / xgb.train if cross-validation was intended.
data_dmatrix = xgb.DMatrix(data=X, label=y)
from sklearn.model_selection import train_test_split
# 70/30 train-test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
from xgboost import XGBClassifier

# Hyperparameters for the gradient-boosted classifier.
# Fixed: 'aplha' was a typo for 'alpha' — XGBoost passes unknown keyword
# parameters through without error, so the intended L1 regularization
# (alpha=10) was silently never applied.
params = {
    'objective': 'binary:logistic',
    'max_depth': 4,
    'alpha': 10,           # L1 regularization term on leaf weights
    'learning_rate': 1.0,
    'n_estimators': 100,
}
xgb_clf = XGBClassifier(**params)
xgb_clf.fit(X_train, y_train)
XGBClassifier(aplha=10, base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=1.0, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=4, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, ...)
# Echo the fitted model's full configuration
print(xgb_clf)
XGBClassifier(aplha=10, base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=1.0, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=4, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, ...)
# Predict on the held-out 30% and report test-set accuracy
y_pred = xgb_clf.predict(X_test)
from sklearn.metrics import accuracy_score
test_accuracy = accuracy_score(y_test, y_pred)
print('XGBoost model accuracy score:{0:0.4f}'.format(test_accuracy))
XGBoost model accuracy score:0.7518